# Operationalize state capacity
# (consider also GDP per capita, but see Hendrix 2010!)

##
## ICRG Data 
##

# extract ICRG data from Hegre et al. 2014
# icrg <- read.dta("Daten/PRS International Country Risk Guide Data/governanceandconflictrelapsereplicationdata/governancedata.dta")
# icrg <- subset(icrg, subset = year > 1988 & year < 2010, select = c(country, year, icrgBQ, icrgCorruption, icrgMilPolitics))
# 
# icrg$country[icrg$country=="Serbia"] <- "Yugoslavia"
# icrg$country[icrg$country=="Taiwan, China"] <- "Taiwan"
# icrg$country <- countrycode(icrg$country, "country.name", "cown", warn = T)
# 
# icrg <- rename(icrg, c("country"= "actorid", "icrgBQ" = "icrg_bq", "icrgCorruption" = "icrg_corruption", "icrgMilPolitics"= "icrg_milpol"), warn_missing = T)
# icrg <- icrg[!is.na(icrg$actorid),]
# save(icrg, file = "Daten/PRS International Country Risk Guide Data/ICRG_1989-2009.RData")

# Load the previously saved ICRG extract (the commented-out code above shows
# how it was built; it is expected to provide the data frame `icrg`) and
# attach its columns to Master, matching on country code and year.
load(file = "Daten/PRS International Country Risk Guide Data/ICRG_1989-2009.RData")
Master <- join(Master, icrg,
               by = c("actorid", "year"),
               type = "left",
               match = "all")


##
## World Bank (from WDI) Total Tax / GDP
##

# Indicator(s) to request from the World Bank.
wdilist <- c("GC.TAX.TOTL.GD.ZS")    # Tax revenue (% of GDP)

# Download the current WDI values of the indicator for all countries,
# 1989-2009, without the extra metadata columns.
wdi <- WDI(country = "all", indicator = wdilist,
           start = 1989, end = 2009, extra = FALSE)

# Translate ISO2 codes into COW numeric codes, then set a handful of codes
# by hand.
wdi$actorid <- countrycode(wdi$iso2c, "iso2c", "cown", warn = TRUE)
wdi$actorid[wdi$country %in% "Kosovo"]       <- 347 # Kosovo
wdi$actorid[wdi$country %in% "Serbia"]       <- 345 # Serbia (Yugoslavia)
wdi$actorid[wdi$country %in% "South Sudan"]  <- 626 # South Sudan (make sure it does not match Sudan)
wdi$actorid[wdi$country %in% "Yemen, Rep."]  <- 678 # used in SVAC for Yemen (in CoW: Yemen Arab Republic!)
wdi$actorid[wdi$country %in% "South Africa"] <- 560 # South Africa
wdi$actorid[wdi$country %in% "Zimbabwe"]     <- 552 # Zimbabwe

# Give the indicator a readable name, drop rows without a COW code and keep
# only the columns used further below.
wdi <- rename(wdi, replace = c("GC.TAX.TOTL.GD.ZS" = "wb_taxgdp"))
wdi <- wdi[!is.na(wdi$actorid), c("country", "actorid", "year", "wb_taxgdp")]


# Load replication data from Hendrix (2010) and merge its tax/GDP series
# into wdi.
hendrix <- read.dta("Daten/World Bank Tax Revenue/Hendrix 2010 Replicationdata/Hendrix_JPR_10_replication.dta")
# `ccode` is used directly as the COW id. An earlier
# countrycode(ccode, "cown", "cown") call was removed here: the column it
# created was discarded by the subset below, so it had no effect on the data.
hendrix <- subset(hendrix, select = c(ccode, year, taxratio))
hendrix <- rename(hendrix, c("ccode" = "actorid", "taxratio" = "hendrix_taxgdp"))
# Rescale by 100 (presumably share -> percent, to match the WDI scale;
# TODO confirm against the Hendrix codebook).
hendrix$hendrix_taxgdp <- hendrix$hendrix_taxgdp * 100
# Full join: keep country-years that appear in either data set.
wdi <- join(wdi, hendrix, type = "full", by = c("actorid", "year"), match = "all")

# Load replication data from Englehart (2009) and merge its tax/GDP series
# into wdi.
englehart <- read.dta("Daten/World Bank Tax Revenue/Englehart 2009 Replicationdata/Englehart_StatesandRights.dta")
# Map country names to COW codes, then set three codes by hand.
englehart$actorid <- countrycode(englehart$country, "country.name", "cown", warn = TRUE)
englehart$actorid[englehart$country == "Congo-Brazzaville"] <- 484 # Congo, Republic of
englehart$actorid[englehart$country == "Congo-Kinshasa"] <- 490 # Democratic Rep. of Congo
englehart$actorid[englehart$country == "Czechoslovakia"] <- 315 # replace 317 (== "Slovakia")
# Drop rows that still have no COW code: they carry no usable merge key and
# would otherwise enter wdi as rows with actorid = NA (the WDI data above is
# filtered the same way).
englehart <- subset(englehart, subset = !is.na(actorid),
                    select = c(actorid, year, taxgdp_best))
englehart <- rename(englehart, c("taxgdp_best" = "englehart_taxgdp"))
# Rescale by 100 (presumably share -> percent, to match the WDI scale;
# TODO confirm against the Englehart codebook).
englehart$englehart_taxgdp <- englehart$englehart_taxgdp * 100
# Full join: keep country-years that appear in either data set.
wdi <- join(wdi, englehart, type = "full", by = c("actorid", "year"), match = "all")

# Keep only observations from 1989 through 2009; the full joins above can
# bring in other years. Rows with a missing year are dropped as well.
wdi <- wdi[which(wdi$year > 1988 & wdi$year < 2010), ]

# Check agreement between the three data sets
# cor(wb_tax$wb_taxgdp, wb_tax$hendrix_taxgdp, use = "pair")
# cor(wb_tax$wb_taxgdp, wb_tax$englehart_taxgdp, use = "pair")
# cor(wb_tax$englehart_taxgdp, wb_tax$hendrix_taxgdp, use = "pair")

# Build one tax/GDP series from the three sources, in order of priority:
#   (1) World Bank value
#   (2) Englehart value where the World Bank value is missing
#   (3) Hendrix value where the result is still missing
# The case counts noted below are the ones recorded by the original author.
wdi$taxgdp <- wdi$wb_taxgdp
# non-missing cases before filling: 1850
wdi$taxgdp[is.na(wdi$wb_taxgdp)] <- wdi$englehart_taxgdp[is.na(wdi$wb_taxgdp)]
# non-missing cases after filling from Englehart: 3279
wdi$taxgdp[is.na(wdi$taxgdp)] <- wdi$hendrix_taxgdp[is.na(wdi$taxgdp)]
# non-missing cases after filling from Hendrix: 3354

# Check agreement between each of the three data sets and the combined series
# cor(wdi$taxgdp, wdi$hendrix_taxgdp, use = "pair") # .58
# cor(wdi$taxgdp, wdi$englehart_taxgdp, use = "pair") # .87
# cor(wdi$taxgdp, wdi$wb_taxgdp, use = "pair") # 1.0

# Join the combined tax/GDP series to Master.
# Keep only the merge keys and the combined series, and drop rows that are
# identical in all three columns.
wdi <- subset(wdi, select = c(actorid, year, taxgdp))
wdi <- wdi[!duplicated(wdi), ]

# The same actorid-year can still occur more than once with differing taxgdp
# values (including one missing and one observed value), e.g. when several
# source country names map to the same COW code. With match = "all" each such
# row would duplicate the matching Master row, so keep exactly one row per
# key, preferring a non-missing value. order() leaves ties in their original
# order, so among equally (non-)missing rows the first one wins.
wdi <- wdi[order(is.na(wdi$taxgdp)), ]
if (anyDuplicated(wdi[, c("actorid", "year")]) > 0) {
  warning("tax/GDP data contain several rows for the same actorid-year; ",
          "keeping the first non-missing value per key",
          call. = FALSE)
  wdi <- wdi[!duplicated(wdi[, c("actorid", "year")]), ]
}

Master <- join(Master, wdi, type = "left", by = c("actorid", "year"), match = "all")


# - obsolete 
# wb_tax <- read.csv(file = "Daten/World Bank Tax Revenue/taxrevenue.csv", header = T, sep = ",", as.is = TRUE)
# wb_tax$Country.Name[wb_tax$Country.Name=="Serbia"] <- "Yugoslavia"
# wb_tax$actorid <- countrycode(wb_tax$Country.Name, "country.name", "cown",warn = T) # add COW-IDs
# #wb_tax <- subset(wb_tax, subset = !is.na(wb_tax$actorid)) # delete all rows w/o COW-IDs
# 
# wb_tax <- subset(wb_tax, subset = !is.na(wb_tax$actorid), select = c("actorid", "X1989", "X1990", "X1991", "X1992", 
#                                                 "X1993", "X1994", "X1995", "X1996", "X1997", "X1998", 
#                                                 "X1999", "X2000", "X2001", "X2002", "X2003", "X2004", 
#                                                 "X2005", "X2006", "X2007", "X2008" ,"X2009"))
# 
# wb_tax <- rename(wb_tax, c("actorid"="actorid","X1989"="1989",
#                                        "X1990"="1990","X1991"="1991","X1992"="1992","X1993"="1993","X1994"="1994",
#                                        "X1995"="1995","X1996"="1996","X1997"="1997","X1998"="1998","X1999"="1999",
#                                        "X2000"="2000","X2001"="2001","X2002"="2002","X2003"="2003","X2004"="2004",
#                                        "X2005"="2005","X2006"="2006","X2007"="2007","X2008"="2008","X2009"="2009"))
# 
# wb_tax <- melt(wb_tax, id.vars=c("actorid")) # from wide to long format
# wb_tax <- rename(wb_tax, c("variable"="year", "value"="wb_taxgdp"))
# wb_tax$year <- as.numeric(as.character(wb_tax$year))


##
## PITF Data (MAGFAIL)
##
# Read the PITF adverse-regime-change sheet, keep 1989-2009 and the three
# columns needed, rename them to the Master conventions and left-join the
# failure magnitude (MAGFAIL) onto Master by country code and year.
pitf <- read.xls(xls = "Daten/PITF Political Instability Task Force/PITF Adverse Regime Change 2013.xls", sheet = 1)
pitf <- subset(pitf, subset = pitf$YEAR > 1988 & pitf$YEAR < 2010, select = c(CCODE, YEAR, MAGFAIL))
pitf <- rename(pitf, c("YEAR" = "year", "CCODE" = "actorid", "MAGFAIL" = "pitf_magfail"))
Master <- join(Master, pitf, type = "left", by = c("actorid", "year"), match = "all")
# Country-years without a PITF record get magnitude 0; see Cohen (2013: 468)
# for this additional transformation. Vector sub-assignment is used because
# the former `Master[rows, ]$pitf_magfail <- 0` form stops with
# "replacement has 1 row, data has 0" when no value is missing.
Master$pitf_magfail[is.na(Master$pitf_magfail)] <- 0


# Remove the temporary objects of this section from the workspace.
# NOTE(review): `wiid` is not created in the code visible here; presumably it
# comes from another part of the script -- confirm, otherwise rm() warns.
rm(list = c("englehart", "hendrix", "icrg", "pitf", "wdi", "wdilist", "wiid"))